{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Demonstrating Hugging Face Modin Interoperability\n",
    "## All the examples in this section are taken/ adapted from https://www.kaggle.com/code/satyampd/imdb-sentiment-analysis-using-bert-w-huggingface/notebook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import modin.pandas as pd\n",
    "import numpy as np # linear algebra"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import sklearn\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import urllib.request\n",
    "url_path = \"https://modin-datasets.intel.com/testing/IMDB_Dataset.csv\"\n",
    "urllib.request.urlretrieve(url_path, \"imdb.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "modin_df = pd.read_csv(\"imdb.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "modin_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "type(modin_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "modin_df.sample()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import BertTokenizer, TFBertForSequenceClassification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Loading the BERT Classifier and Tokenizer along with Input module\n",
    "from transformers import InputExample, InputFeatures\n",
    "\n",
    "model = TFBertForSequenceClassification.from_pretrained(\"bert-base-uncased\")\n",
    "tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# changing positive and negative into numeric values\n",
    "\n",
    "def cat2num(value):\n",
    "    if value=='positive': \n",
    "        return 1\n",
    "    else: \n",
    "        return 0\n",
    "    \n",
    "modin_df['sentiment']  =  modin_df['sentiment'].apply(cat2num)\n",
    "train = modin_df[:45000]\n",
    "test = modin_df[45000:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# But first see BERT tokenizer exmaples and other required stuff!\n",
    "\n",
    "example='In this Kaggle notebook, I will do sentiment analysis using BERT with Huggingface'\n",
    "tokens=tokenizer.tokenize(example)\n",
    "token_ids = tokenizer.convert_tokens_to_ids(tokens)\n",
    "print(tokens)\n",
    "print(token_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "type(train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_data_to_examples(train, test, review, sentiment): \n",
    "    train_InputExamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case\n",
    "                                                          text_a = x[review], \n",
    "                                                          label = x[sentiment]), axis = 1)\n",
    "\n",
    "    validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case\n",
    "                                                          text_a = x[review], \n",
    "                                                          label = x[sentiment]), axis = 1,)\n",
    "  \n",
    "    return train_InputExamples, validation_InputExamples\n",
    "\n",
    "train_InputExamples, validation_InputExamples = convert_data_to_examples(train,  test, 'review',  'sentiment')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):\n",
    "    features = [] # -> will hold InputFeatures to be converted later\n",
    "\n",
    "    for e in tqdm(examples):\n",
    "        input_dict = tokenizer.encode_plus(\n",
    "            e.text_a,\n",
    "            add_special_tokens=True,    # Add 'CLS' and 'SEP'\n",
    "            max_length=max_length,    # truncates if len(s) > max_length\n",
    "            return_token_type_ids=True,\n",
    "            return_attention_mask=True,\n",
    "            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length\n",
    "            truncation=True\n",
    "        )\n",
    "\n",
    "        input_ids, token_type_ids, attention_mask = (input_dict[\"input_ids\"],input_dict[\"token_type_ids\"], input_dict['attention_mask'])\n",
    "        features.append(InputFeatures( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label) )\n",
    "\n",
    "    def gen():\n",
    "        for f in features:\n",
    "            yield (\n",
    "                {\n",
    "                    \"input_ids\": f.input_ids,\n",
    "                    \"attention_mask\": f.attention_mask,\n",
    "                    \"token_type_ids\": f.token_type_ids,\n",
    "                },\n",
    "                f.label,\n",
    "            )\n",
    "\n",
    "    return tf.data.Dataset.from_generator(\n",
    "        gen,\n",
    "        ({\"input_ids\": tf.int32, \"attention_mask\": tf.int32, \"token_type_ids\": tf.int32}, tf.int64),\n",
    "        (\n",
    "            {\n",
    "                \"input_ids\": tf.TensorShape([None]),\n",
    "                \"attention_mask\": tf.TensorShape([None]),\n",
    "                \"token_type_ids\": tf.TensorShape([None]),\n",
    "            },\n",
    "            tf.TensorShape([]),\n",
    "        ),\n",
    "    )\n",
    "\n",
    "\n",
    "DATA_COLUMN = 'review'\n",
    "LABEL_COLUMN = 'sentiment'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_InputExamples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)\n",
    "train_data = train_data.shuffle(100).batch(32).repeat(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)\n",
    "validation_data = validation_data.batch(32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), \n",
    "              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), \n",
    "              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.fit(train_data, epochs=2, validation_data=validation_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_sentences = ['worst movie of my life, will never watch movies from this series', \n",
    "                  'Wow, blew my mind, what a movie by Marvel, animation and story is amazing']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')   # we are tokenizing before sending into our trained model\n",
    "tf_outputs = model(tf_batch)                                  \n",
    "tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)       # axis=-1, this means that the index that will be returned by argmax will be taken from the *last* axis.\n",
    "labels = ['Negative','Positive']\n",
    "label = tf.argmax(tf_predictions, axis=1)\n",
    "label = label.numpy()\n",
    "for i in range(len(pred_sentences)):\n",
    "    print(pred_sentences[i], \": \", labels[label[i]])"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "9752fa87da8bf164654ccc33a595e9110c8fc9bb15d763374a7037fd32519b1f"
  },
  "kernelspec": {
   "display_name": "Python 3.9.7 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
